{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Four regression models are compared: ordinary least squares, ridge, lasso and random forest\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn import model_selection\n",
    "from sklearn.ensemble import RandomForestRegressor as RFR # 随机森林\n",
    "from sklearn.linear_model import LinearRegression as LR # 最小二乘法\n",
    "from sklearn.linear_model import Ridge as R # 岭回归\n",
    "from sklearn.linear_model import Lasso as L # 索套"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Users\\80423\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:12: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  if sys.path[0] == '':\n",
      "D:\\Users\\80423\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:13: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  del sys.path[0]\n"
     ]
    }
   ],
   "source": [
    "my = pd.read_csv('maoyanfilms_clean.csv')\n",
    "# Remove NaN values one feature at a time so that no feature column keeps a NaN.\n",
    "# Each step keeps the first comma-separated entry of a column and inner-merges it back\n",
    "# on films_id: rows without a value disappear and the cleaned column gets the '_x' suffix.\n",
    "df1 = my[['films_id','films_type']].dropna()\n",
    "df1['films_type'] = df1['films_type'].apply(lambda x: x.split(',')[0])\n",
    "df2 = pd.merge(df1,my,on='films_id')\n",
    "# Discard the 'Unnamed: 0' column and the raw (un-split) type column.\n",
    "df3 = df2.loc[:,(df2.columns!='Unnamed: 0')&(df2.columns!='films_type_y')]\n",
    "df4  = df3[['films_id','films_area']].dropna()\n",
    "df4['films_area'] = df4['films_area'].apply(lambda x: x.split(',')[0])\n",
    "df5 = pd.merge(df4,df3,on='films_id')\n",
    "# .copy() makes df6 an independent frame, so the two column assignments below\n",
    "# no longer raise SettingWithCopyWarning.\n",
    "df6 = df5.loc[:,(df5.columns!='films_area_y')].copy()\n",
    "# Strip the last two characters of the duration text (presumably a unit suffix - confirm\n",
    "# against the CSV); the result is still a string.\n",
    "df6['films_duration'] = df6['films_duration'].apply(lambda x: x[:-2])\n",
    "# Keep the part before the first '-' of the release date (presumably the year), as a string.\n",
    "df6['film_first_time'] = df6['film_first_time'].apply(lambda x: str(x).split('-')[0])\n",
    "# Keep only films that have both a score and a box-office value.\n",
    "df7 = df6[['films_id','films_score','films_box_office']].dropna()\n",
    "df8=pd.merge(df7,df6,on='films_id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>类型</th>\n",
       "      <th>时长</th>\n",
       "      <th>年份</th>\n",
       "      <th>评分</th>\n",
       "      <th>地区</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>喜剧</td>\n",
       "      <td>122</td>\n",
       "      <td>2010</td>\n",
       "      <td>7.4</td>\n",
       "      <td>中国大陆</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>爱情</td>\n",
       "      <td>117</td>\n",
       "      <td>2009</td>\n",
       "      <td>6.1</td>\n",
       "      <td>中国香港</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>爱情</td>\n",
       "      <td>128</td>\n",
       "      <td>2012</td>\n",
       "      <td>7.8</td>\n",
       "      <td>中国台湾</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>爱情</td>\n",
       "      <td>112</td>\n",
       "      <td>2012</td>\n",
       "      <td>7.1</td>\n",
       "      <td>中国大陆</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>爱情</td>\n",
       "      <td>101</td>\n",
       "      <td>2007</td>\n",
       "      <td>8.5</td>\n",
       "      <td>中国香港</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1079</th>\n",
       "      <td>喜剧</td>\n",
       "      <td>80</td>\n",
       "      <td>2019</td>\n",
       "      <td>8.9</td>\n",
       "      <td>中国大陆</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1080</th>\n",
       "      <td>喜剧</td>\n",
       "      <td>98</td>\n",
       "      <td>2019</td>\n",
       "      <td>8.5</td>\n",
       "      <td>中国大陆</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1081</th>\n",
       "      <td>喜剧</td>\n",
       "      <td>84</td>\n",
       "      <td>2019</td>\n",
       "      <td>8.1</td>\n",
       "      <td>中国大陆</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1082</th>\n",
       "      <td>剧情</td>\n",
       "      <td>114</td>\n",
       "      <td>2019</td>\n",
       "      <td>7.9</td>\n",
       "      <td>中国大陆</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1083</th>\n",
       "      <td>动画</td>\n",
       "      <td>91</td>\n",
       "      <td>2019</td>\n",
       "      <td>8.0</td>\n",
       "      <td>中国大陆</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1084 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        类型   时长    年份   评分    地区\n",
       "0      喜剧   122  2010  7.4  中国大陆\n",
       "1      爱情   117  2009  6.1  中国香港\n",
       "2      爱情   128  2012  7.8  中国台湾\n",
       "3      爱情   112  2012  7.1  中国大陆\n",
       "4      爱情   101  2007  8.5  中国香港\n",
       "...    ...  ...   ...  ...   ...\n",
       "1079   喜剧    80  2019  8.9  中国大陆\n",
       "1080   喜剧    98  2019  8.5  中国大陆\n",
       "1081   喜剧    84  2019  8.1  中国大陆\n",
       "1082   剧情   114  2019  7.9  中国大陆\n",
       "1083   动画    91  2019  8.0  中国大陆\n",
       "\n",
       "[1084 rows x 5 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Feature columns, gathered into a new DataFrame with Chinese display names.\n",
    "feature_cols = ['films_type_x','films_duration','film_first_time','films_score_x','films_area_x']\n",
    "new_col = ['类型','时长','年份','评分','地区']\n",
    "# .copy() detaches myd from df8, so appending rows to it later does not raise\n",
    "# SettingWithCopyWarning.\n",
    "myd = df8[feature_cols].copy()\n",
    "myd.columns = new_col\n",
    "myd\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Non-numeric features must be encoded before they can be used by a model.\n",
    "def extract_feature(myd):\n",
    "    \"\"\"Return the model input frame for a film feature table.\n",
    "\n",
    "    The '时长', '年份' and '评分' columns are passed through unchanged; the '类型' and\n",
    "    '地区' columns are replaced by their one-hot (dummy) columns, prefixed with the\n",
    "    original column name. Column order: pass-through columns, type dummies, area dummies.\n",
    "    \"\"\"\n",
    "    passthrough = myd[['时长','年份','评分']]\n",
    "    type_dummies = pd.get_dummies(myd['类型'],prefix='类型')\n",
    "    area_dummies = pd.get_dummies(myd['地区'],prefix='地区')\n",
    "    # One column-wise concat of the three pieces, all sharing myd's index.\n",
    "    return pd.concat([passthrough,type_dummies,area_dummies],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "def prediction_sorce(df,filmsData,estimator=None):\n",
    "    \"\"\"Fit a regressor on every row of df and predict the score of one extra film.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : DataFrame with the columns '类型', '时长', '年份', '评分', '地区' (the data set).\n",
    "    filmsData : list with one value per column of df, in column order, describing the\n",
    "        film to predict. Its '评分' entry is not used for fitting.\n",
    "    estimator : optional estimator class (called with no arguments). When omitted, the\n",
    "        module-level name `model` is used, as before.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    (y_pred, datas) : the array returned by the estimator's predict() for the query row,\n",
    "        and the encoded feature frame, whose last row is the query film.\n",
    "    \"\"\"\n",
    "    # Append the query row to a NEW frame. Writing df.loc[len(df)] would grow the caller's\n",
    "    # frame on every call, so later fits would train on earlier query rows.\n",
    "    query = pd.DataFrame([filmsData],columns=df.columns)\n",
    "    combined = pd.concat([df,query],ignore_index=True)\n",
    "    # Encode data set and query together so both share the same dummy columns.\n",
    "    datas = extract_feature(combined)\n",
    "    x = datas.loc[:,datas.columns != '评分'].values\n",
    "    # 1-D target: a column vector makes some estimators emit DataConversionWarning.\n",
    "    y = datas['评分'].values\n",
    "    x_train, y_train = x[:-1], y[:-1] # every row except the query\n",
    "    x_test = x[-1].reshape(1,-1) # the query row as a single-sample 2-D array\n",
    "    model_cls = model if estimator is None else estimator\n",
    "    Model = model_cls()\n",
    "    Model.fit(x_train,y_train)\n",
    "    y_pred = Model.predict(x_test)\n",
    "    return y_pred,datas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict_score(datas,model,test_size=0.3,random_state=42):\n",
    "    \"\"\"Return the hold-out score of an estimator class on the encoded feature frame.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    datas : encoded feature frame containing a '评分' target column.\n",
    "    model : estimator class; it is instantiated with no arguments.\n",
    "    test_size : share of rows held out for scoring (0.3 as before).\n",
    "    random_state : seed for the train/test split, so the split is reproducible;\n",
    "        pass None for a different random split on every call.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    The value of the fitted estimator's score() on the held-out rows.\n",
    "    \"\"\"\n",
    "    x = datas.loc[:,datas.columns != '评分'].values\n",
    "    # 1-D target: a column vector makes some estimators emit DataConversionWarning.\n",
    "    y = datas['评分'].values\n",
    "    x_train, x_test, y_train, y_test = model_selection.train_test_split(\n",
    "        x,y,test_size=test_size,random_state=random_state)\n",
    "\n",
    "    Model = model()\n",
    "    Model.fit(x_train,y_train)\n",
    "    return Model.score(x_test,y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Users\\80423\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:670: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  self._setitem_with_indexer(indexer, value)\n",
      "D:\\Users\\80423\\anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
      "  \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n",
      "D:\\Users\\80423\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:9: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
      "  if __name__ == '__main__':\n",
      "D:\\Users\\80423\\anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
      "  \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n",
      "D:\\Users\\80423\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:7: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
      "  import sys\n",
      "D:\\Users\\80423\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:670: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  self._setitem_with_indexer(indexer, value)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "预测评分 7.7\n",
      "预测评分 7.8\n",
      "预测评分 7.6\n",
      "预测评分 6.8\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Users\\80423\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:670: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  self._setitem_with_indexer(indexer, value)\n",
      "D:\\Users\\80423\\anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:670: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  self._setitem_with_indexer(indexer, value)\n"
     ]
    }
   ],
   "source": [
    "modellist = [RFR,LR,R,L]\n",
    "# Film to predict, in the column order of myd: type, duration, year, score, area.\n",
    "# The same query is used for every model, so it is defined once outside the loop.\n",
    "filmsData = ['喜剧',80,2019,7.8,'中国大陆']\n",
    "for m in modellist:\n",
    "    # Kept as a module-level name: prediction_sorce looks up `model` when it is not\n",
    "    # given an estimator explicitly.\n",
    "    model = m\n",
    "    # Pass a copy so the query row can never be added to myd itself; otherwise every\n",
    "    # later model would be trained on the rows appended for the earlier ones.\n",
    "    garden,datas = prediction_sorce(myd.copy(),filmsData)\n",
    "    score = predict_score(datas,model)\n",
    "    # Fixed-point formatting: '{:.2}' means two significant digits and prints 10.0 as '1e+01'.\n",
    "    garden_test = '{:.1f}'.format(garden.flatten()[0])\n",
    "    # Report which model produced the prediction, together with its hold-out score.\n",
    "    print('{}: R^2 = {:.3f}'.format(m.__name__,score))\n",
    "    print('预测评分' + ' ' + garden_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
